\documentclass{llncs}
%
\usepackage{makeidx}  % allows for indexgeneration
%
\usepackage{graphicx}
\usepackage{stfloats}
\usepackage{amsmath}
\usepackage{bbm}
\usepackage{amssymb}
\usepackage{multirow}
\usepackage{mathrsfs}
\usepackage{url}
\usepackage{listings}
\usepackage{color}
\usepackage{wrapfig}

\lstloadlanguages{XML}


\newcommand{\beq}{\begin{equation}}
\newcommand{\enq}{\end{equation}}
\newcounter{mytempeqncnt}
\newcommand{\bquote}{\begin{quote}}
\newcommand{\equote}{\end{quote}}

\newcommand{\dtr}[1]{\textbf{\textit{#1}$^\textbf{[dtr]}$}}

\begin{document}
\frontmatter          % for the preliminaries
%
\pagestyle{headings}  % switches on printing of running heads
% \addtocmark{Prior information and event spaces} % additional mark in the TOC

\title{SERIMI: Class-based Disambiguation for Effective Instance Matching over Heterogeneous Web Data}

%
\titlerunning{}  % abbreviated title (for running head)
%                                     also used for the TOC unless
%                                     \toctitle is used
\author{Samur Araujo\inst{1}, Duc Thanh Tran\inst{2}, Arjen P. de Vries\inst{1}, Jan Hidders\inst{1}, and Daniel Schwabe\inst{3}}

% Jeffrey Dean \and David Grove \and Craig Chambers \and Kim~B.~Bruce \and
% Elsa Bertino}
% %
\authorrunning{}   % abbreviated author list (for running head)
% %
% %%%% list of authors for the TOC (use if author list has to be modified)
% \tocauthor{Ivar Ekeland, Roger Temam, Jeffrey Dean, David Grove,
% Craig Chambers, Kim B. Bruce, Elisa Bertino}
% %

\institute{Delft University of Technology, PO Box 5031, 2600 GA Delft, the Netherlands \email{\{S.F.CardosodeAraujo, A.P.deVries, A.J.H.Hidders\}@tudelft.nl}
\and Karlsruhe Institute of Technology, Germany \\
\email{ducthanh.tran@kit.edu}
\and Informatics Department, PUC-Rio, Rua Marqu\^es de S\~ao Vicente, 225, Rio de Janeiro, Brazil\\
\email{dschwabe@inf.puc-rio.br}
}

\maketitle

\begin{abstract} 
%Instance matching refers to the problem of determining whether two descriptions refer to the same real-world entity. 
%Traditionally, this problem 
Instance matching has been studied with focus on the single-domain setting, 
%where data come from the same or similar datasets, 
while less attention is given to the 
%large and 
heterogeneous environment of the Web, where data come from different domains and are associated with different schemas. 
%While these single-domain solutions have shown high quality results in enterprise data integration scenarios, 
%The applicability of single-domain solutions to this 
%%large-scale heterogeneous Web of Data 
%setting is less clear. 
%Assumptions implicitly embodied in these solutions no longer apply. 
For this heterogeneous setting, we propose an unsupervised schema-agnostic approach that focuses on the refinement (\emph{disambiguation}) of candidate instances (resulting from blocking).  
%, a preprocesing step). 
% in the Web data setting. 
%This novelty approach uses a specific similarity measure that we propose for this task. 
%It starts with 
Given instances of a source dataset that belong to a class,  
it computes candidates in the target datasets and refines them such that the \emph{remaining matches correspond to the source instances at the class level}. However, no schema knowledge or explicit correspondences between classes in the source and target datasets are required for this. Rather, the disambiguation is performed based on an \emph{instance-based representation of classes} computed online. We evaluated our work using experiments on large-scale real-world datasets provided by a benchmark. 
%, showing promising results. 
The proposed solution outperformed two alternative approaches for instance matching in 70\% of the cases, and in those cases we improved average F-measure by 10\%. 
% than the baseline and 
%These results suggest approach is especially recommended in situations where there is little overlap between the source and target schemas. 
%and the traditional approaches cannot be applicable.
\\\\
\textbf{Keywords}: data integration, RDF interlinking, instance matching, linked data, entity search.
\end{abstract} 

\input{sec-intro}
\input{sec-related}
\input{sec-problem}
\input{sec-approach}
\input{sec-experiment}

\section{Conclusions}
We investigated the instance matching problem in the large and heterogeneous environment of the Web, where data come from different domains and are associated with different schemas. We proposed SERIMI as a completely unsupervised schema-agnostic approach that focuses on the effective matching of candidate instances (resulting from blocking). Our approach was able to refine the ambiguous matches provided by existing blocking techniques. We evaluated the accuracy of SERIMI based on experiments on large-scale real-world datasets. We outperformed two alternative approaches in 70\% of the cases, and in those cases we improved F1 by 10\% on average. Our approach is especially recommended in situations where there is little overlap between the source and target schemas (i.e., where traditional single-domain approaches are not applicable). As future work, we consider the use of more advanced blocking strategies to further improve the accuracy of the candidates that are refined by SERIMI. Also, we will integrate the ideas of single-domain matching into the overall solution so that SERIMI can perform well both in the traditional single-domain and the heterogeneous multiple-domain setting. 

\bibliographystyle{abbrv}
\bibliography{eswc}
\end{document}
